"""
@Author : 合肥一元数智教育科技有限公司
@Date :  2025/7/9 8:44
@Description : 

"""
import csv
import os
import random
import re
import time

import requests


class DaTuSpider:
    """Scrape tech-photo listing pages from daimg.com.

    For each listing page it extracts (detail_href, img_src, title) tuples,
    downloads every image into the local ``imgs/`` directory, and appends one
    row per item to ``datu.csv``.
    """

    def __init__(self):
        # {} is filled in by run() with the page number (1..19).
        self.url = 'http://www.daimg.com/photo/tech/list_68_{}.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36',
            'Cookie': '__51cke__=; __tins__21295873=%7B%22sid%22%3A%201752021583859%2C%20%22vd%22%3A%208%2C%20%22expires%22%3A%201752023579442%7D; __51laig__=8'
        }
        # Fix: the image directory must exist before save_html() opens files in it,
        # otherwise the first download raises FileNotFoundError.
        os.makedirs('imgs', exist_ok=True)
        self.f = open('datu.csv', 'w', encoding='utf-8', newline='')
        self.writer = csv.writer(self.f)
        self.writer.writerow(['two_page_href', 'img_src', 'title'])

    def get_html(self, url):
        """Fetch one listing page and pass its HTML to the parser.

        The site serves GBK-encoded pages; errors='replace' keeps an
        occasional invalid byte from aborting the whole crawl.
        A timeout prevents the request from hanging indefinitely.
        """
        html = requests.get(url, headers=self.headers, timeout=10).content.decode('gbk', errors='replace')
        self.parse_html(html)

    def parse_html(self, html):
        """Extract (detail_href, img_src, title) tuples from a listing page."""
        # Regex over the <li> entries of the photo list on the first-level page.
        one_reg = '<li><a target="_blank" href="(.*?)" .*?><img .*? src="(.*?)" .*?>(.*?)</a></li>'
        r_list = re.findall(one_reg, html, re.S)
        self.save_html(r_list)

    def save_html(self, data):
        """Download each item's image and append its record to the CSV file.

        A failed or non-200 download is logged and skipped so one bad image
        does not abort the entire crawl.
        """
        for r in data:
            img_src = r[1]
            try:
                response = requests.get(img_src, headers=self.headers, timeout=10)
                response.raise_for_status()
            except requests.RequestException as err:
                # Best-effort: report and move on to the next image.
                print(f'skip {img_src}: {err}')
                continue
            # Fix: titles may contain characters that are illegal in file
            # names (e.g. / \ : * ? " < > |) — replace them before open().
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', r[2]).strip()
            with open(f'imgs/{safe_title}.jpg', 'wb') as f:
                f.write(response.content)
            self.writer.writerow(r)

    def run(self):
        """Crawl listing pages 1..19 with a random polite delay per page."""
        try:
            for page in range(1, 20):
                url = self.url.format(page)
                # Random 1-5 s pause to avoid hammering the server.
                time.sleep(random.randint(1, 5))
                self.get_html(url)
        finally:
            # Fix: the CSV handle was never closed (resource leak; buffered
            # rows could be lost on interpreter teardown).
            self.f.close()


if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl.
    DaTuSpider().run()
